notebook.community

Edit and run



In [1]:

    
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules are re-imported before each cell runs and edits to local modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline



In [2]:

    
from munging import session
from munging import transform


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression



In [23]:

    
## load data
# The path is relative to the notebook's working directory.
data = pd.read_csv("data/amazon_employee_access/train.csv")
## make ids recognized as categorical data
# Every column after the first (ACTION) is cast to strings. The builtin `str`
# is used because `np.str` was only an alias for it: the alias was deprecated
# in NumPy 1.20 and removed in 1.24, so `np.str` raises AttributeError there.
for f in data.columns[1:]:
    data[f] = data[f].astype(str)
# Preview the first three rows to confirm the load and the cast.
data.head(3)









    Out[23]:






  
    
      
      ACTION
      RESOURCE
      MGR_ID
      ROLE_ROLLUP_1
      ROLE_ROLLUP_2
      ROLE_DEPTNAME
      ROLE_TITLE
      ROLE_FAMILY_DESC
      ROLE_FAMILY
      ROLE_CODE
    
  
  
    
      0
       1
       39353
       85475
       117961
       118300
       123472
       117905
       117906
       290919
       117908
    
    
      1
       1
       17183
        1540
       117961
       118343
       123125
       118536
       118536
       308574
       118539
    
    
      2
       1
       36724
       14457
       118219
       118220
       117884
       117879
       267952
        19721
       117880



In [25]:

    
## exploration session
# Build the munging session over the loaded frame. "ACTION" is passed as the
# second argument (presumably the target column -- confirm against
# munging.session.Session) and random_state is pinned to 0.
dsession = session.Session(data, "ACTION", random_state=0)
# Starts empty; nothing is appended to it in the cells shown here.
transformers = []
# Single-argument print(...) emits the same text on Python 2 and Python 3; the
# bare `print x` statement is a SyntaxError on Python 3.
print(dsession.get_parameters())









    



{'SKEWNESS_THR': 20, 'FRAC_OF_NA_TO_IGNORE': 0.95, 'FRAC_OF_FEAT_TO_BE_NONINFORMATIVE': 0.96, 'REDUNDANT_FEAT_CORR_THR': 0.95, 'MIN_NUM_VALUES_FOR_NUMERICAL': 5}



In [26]:

    
## numerical and categorical features
# Ask the session for the features accepted by each of its two predicates.
numerical_feats = dsession.get_features_of(dsession.is_numerical_feature)
categorical_feats = dsession.get_features_of(dsession.is_categorical_feature)
# Report how many features fall into each group. Single-argument print(...)
# emits the same text on Python 2 and Python 3, unlike the `print x` statement.
print(len(numerical_feats))
print(len(categorical_feats))

0
9



In [28]:

    
## knowing what you are dealing with
# Fraction of rows per ACTION value. Uses the Series method because the
# top-level pd.value_counts function is deprecated in recent pandas. The
# denominator stays the total row count, and float() keeps the division a true
# division on Python 2 as well.
data["ACTION"].value_counts() / float(data.shape[0])









    Out[28]:





1    0.94211
0    0.05789
dtype: float64



In [ ]:

	ACTION	RESOURCE	MGR_ID	ROLE_ROLLUP_1	ROLE_ROLLUP_2	ROLE_DEPTNAME	ROLE_TITLE	ROLE_FAMILY_DESC	ROLE_FAMILY	ROLE_CODE
0	1	39353	85475	117961	118300	123472	117905	117906	290919	117908
1	1	17183	1540	117961	118343	123125	118536	118536	308574	118539
2	1	36724	14457	118219	118220	117884	117879	267952	19721	117880